<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <meta name="author" content="PG-Strom Development Team">
  <link rel="shortcut icon" href="../img/favicon.ico">
  <title>Trouble Shooting - PG-Strom Manual</title>
  <link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>

  <link rel="stylesheet" href="../css/theme.css" type="text/css" />
  <link rel="stylesheet" href="../css/theme_extra.css" type="text/css" />
  <link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css">
  <link href="//fonts.googleapis.com/earlyaccess/notosansjp.css" rel="stylesheet">
  <link href="//fonts.googleapis.com/css?family=Open+Sans:600,800" rel="stylesheet">
  <link href="../custom.css" rel="stylesheet">
  
  <script>
    // Current page data
    var mkdocs_page_name = "Trouble Shooting";
    var mkdocs_page_input_path = "troubles.md";
    var mkdocs_page_url = null;
  </script>
  
  <script src="../js/jquery-2.1.1.min.js" defer></script>
  <script src="../js/modernizr-2.8.3.min.js" defer></script>
  <script src="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
  <script>hljs.initHighlightingOnLoad();</script> 
  
</head>

<body class="wy-body-for-nav" role="document">

  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
      <div class="wy-side-nav-search">
        <a href=".." class="icon icon-home"> PG-Strom Manual</a>
        <div role="search">
  <form id ="rtd-search-form" class="wy-form" action="../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
  </form>

  [<a href="../ja" style="color: #cccccc">Japanese</a> | <strong>English</strong>]

</div>
      </div>

      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
	<ul class="current">
	  
          
            <li class="toctree-l1">
		
    <a class="" href="..">Home</a>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../install/">Install</a>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">Tutorial</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../operations/">Basic Operations</a>
                </li>
                <li class="">
                    
    <a class="" href="../sys_admin/">System Administration</a>
                </li>
                <li class="">
                    
    <a class="" href="../brin/">Index Support</a>
                </li>
                <li class="">
                    
    <a class="" href="../partition/">Partitioning</a>
                </li>
                <li class=" current">
                    
    <a class="current" href="./">Trouble Shooting</a>
    <ul class="subnav">
            
    <li class="toctree-l3"><a href="#identify-the-problem">Identify the problem</a></li>
    

    <li class="toctree-l3"><a href="#collecting-crash-dump">Collecting crash dump</a></li>
    
        <ul>
        
            <li><a class="toctree-l4" href="#add-configuration-on-postgresql-startup">Add configuration on PostgreSQL startup</a></li>
        
            <li><a class="toctree-l4" href="#installation-of-debuginfo-package">Installation of debuginfo package</a></li>
        
            <li><a class="toctree-l4" href="#checking-the-back-trace-on-cpu-side">Checking the back-trace on CPU side</a></li>
        
            <li><a class="toctree-l4" href="#checking-the-backtrace-on-gpu">Checking the backtrace on GPU</a></li>
        
        </ul>
    

    </ul>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">Advanced Features</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../ssd2gpu/">SSD2GPU Direct SQL</a>
                </li>
                <li class="">
                    
    <a class="" href="../arrow_fdw/">Arrow_fdw</a>
                </li>
                <li class="">
                    
    <a class="" href="../gstore_fdw/">Gstore_fdw</a>
                </li>
                <li class="">
                    
    <a class="" href="../plcuda/">PL/CUDA</a>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <span class="caption-text">References</span>
    <ul class="subnav">
                <li class="">
                    
    <a class="" href="../ref_types/">Data Types</a>
                </li>
                <li class="">
                    
    <a class="" href="../ref_devfuncs/">Functions and Operators</a>
                </li>
                <li class="">
                    
    <a class="" href="../ref_sqlfuncs/">SQL Objects</a>
                </li>
                <li class="">
                    
    <a class="" href="../ref_params/">GUC Parameters</a>
                </li>
    </ul>
	    </li>
          
            <li class="toctree-l1">
		
    <a class="" href="../release_note/">Release Note</a>
	    </li>
          
        </ul>
      </div>
      &nbsp;
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
        <a href="..">PG-Strom Manual</a>
      </nav>

      
      <div class="wy-nav-content">
        <div class="rst-content">
          <div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
    <li><a href="..">Docs</a> &raquo;</li>
    
      
        
          <li>Tutorial &raquo;</li>
        
      
    
    <li>Trouble Shooting</li>
    <li class="wy-breadcrumbs-aside">
      
    </li>
  </ul>
  <hr/>
</div>
          <div role="main">
            <div class="section">
              
                <h1>Trouble Shooting</h1>

<h1 id="identify-the-problem">Identify the problem</h1>
<p>When a particular workload causes problems, the first step is to identify which component is responsible.</p>
<p>Unfortunately, the PG-Strom development community has far fewer developers than the PostgreSQL community, so from the standpoint of software quality and track record, it is reasonable to suspect PG-Strom first.</p>
<p>The <code>pg_strom.enabled</code> parameter allows you to turn all the functionality of PG-Strom on or off at once.
The configuration below disables PG-Strom, so the system behaves identically to standard PostgreSQL.</p>
<pre><code># SET pg_strom.enabled = off;
</code></pre>

<p>In addition, we provide parameters to disable particular execution plans such as GpuScan, GpuJoin and GpuPreAgg.</p>
<p>See <a href="../ref_params/">references/GUC Parameters</a> for more details.</p>
<h1 id="collecting-crash-dump">Collecting crash dump</h1>
<p>A crash dump is very helpful for analyzing serious problems, such as those that lead to a system crash.
This section introduces how to collect crash dumps of the PostgreSQL and PG-Strom processes (CPU side) and of PG-Strom's GPU kernels, and how to display the back-trace for such problems.</p>
<h2 id="add-configuration-on-postgresql-startup">Add configuration on PostgreSQL startup</h2>
<p>To generate a crash dump (CPU side) when a process crashes, you need to change the operating system's resource limit on the size of the core file that the PostgreSQL server process can generate.</p>
<p>To generate a crash dump (GPU side) on GPU kernel errors, the PostgreSQL server process must have the <code>CUDA_ENABLE_COREDUMP_ON_EXCEPTION</code> environment variable set to <code>1</code>.</p>
<p>When PostgreSQL is started by systemd, you can put a configuration file in <code>/etc/systemd/system/postgresql-&lt;version&gt;.service.d/</code>.</p>
<p>In the case of an RPM installation, a configuration file <code>pg_strom.conf</code> is also installed in that directory, and contains the following initial configuration.</p>
<pre><code>[Service]
LimitNOFILE=65536
LimitCORE=infinity
#Environment=CUDA_ENABLE_COREDUMP_ON_EXCEPTION=1
</code></pre>

<p>In CUDA 9.1, it usually takes several minutes or more to generate a crash dump of a GPU kernel, and during that time the PostgreSQL session which caused the error stops responding entirely.
So, we recommend setting the <code>CUDA_ENABLE_COREDUMP_ON_EXCEPTION</code> environment variable only while you investigate GPU kernel errors that happen on a certain query.
The default configuration of the RPM installation comments out the line that sets the <code>CUDA_ENABLE_COREDUMP_ON_EXCEPTION</code> environment variable.</p>
<p>After the next restart, the PostgreSQL server process should have an unlimited <em>Max core file size</em> setting.</p>
<p>You can check it as follows.</p>
<pre><code># cat /proc/&lt;PID of postmaster&gt;/limits
Limit                     Soft Limit           Hard Limit           Units
    :                         :                    :                  :
Max core file size        unlimited            unlimited            bytes
    :                         :                    :                  :
</code></pre>

<h2 id="installation-of-debuginfo-package">Installation of debuginfo package</h2>
<pre><code># yum install postgresql10-debuginfo pg_strom-PG10-debuginfo
            :
================================================================================
 Package                  Arch    Version             Repository           Size
================================================================================
Installing:
 pg_strom-PG10-debuginfo  x86_64  1.9-180301.el7      heterodb-debuginfo  766 k
 postgresql10-debuginfo   x86_64  10.3-1PGDG.rhel7    pgdg10              9.7 M

Transaction Summary
================================================================================
Install  2 Packages
            :
Installed:
  pg_strom-PG10-debuginfo.x86_64 0:1.9-180301.el7
  postgresql10-debuginfo.x86_64 0:10.3-1PGDG.rhel7

Complete!
</code></pre>

<h2 id="checking-the-back-trace-on-cpu-side">Checking the back-trace on CPU side</h2>
<p>The kernel parameters <code>kernel.core_pattern</code> and <code>kernel.core_uses_pid</code> determine the path where the crash dump is written.
It is usually created in the current working directory of the process, so if you start the PostgreSQL server using systemd, check <code>/var/lib/pgdata</code>, where the database cluster is deployed.</p>
<p>Once the <code>core.&lt;PID&gt;</code> file has been generated, you can use <code>gdb</code> to examine the back-trace leading up to the crash.</p>
<p>Specify the core file to <code>gdb</code> with the <code>-c</code> option, and the crashed program with the <code>-f</code> option.</p>
<pre><code># gdb -c /var/lib/pgdata/core.134680 -f /usr/pgsql-10/bin/postgres
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7_4.1
       :
(gdb) bt
#0  0x00007fb942af3903 in __epoll_wait_nocancel () from /lib64/libc.so.6
#1  0x00000000006f71ae in WaitEventSetWaitBlock (nevents=1,
    occurred_events=0x7ffee51e1d70, cur_timeout=-1, set=0x2833298)
    at latch.c:1048
#2  WaitEventSetWait (set=0x2833298, timeout=timeout@entry=-1,
    occurred_events=occurred_events@entry=0x7ffee51e1d70,
    nevents=nevents@entry=1, wait_event_info=wait_event_info@entry=100663296)
    at latch.c:1000
#3  0x00000000006210fb in secure_read (port=0x2876120,
    ptr=0xcaa7e0 &lt;PqRecvBuffer&gt;, len=8192) at be-secure.c:166
#4  0x000000000062b6e8 in pq_recvbuf () at pqcomm.c:963
#5  0x000000000062c345 in pq_getbyte () at pqcomm.c:1006
#6  0x0000000000718682 in SocketBackend (inBuf=0x7ffee51e1ef0)
    at postgres.c:328
#7  ReadCommand (inBuf=0x7ffee51e1ef0) at postgres.c:501
#8  PostgresMain (argc=&lt;optimized out&gt;, argv=argv@entry=0x287bb68,
    dbname=0x28333f8 &quot;postgres&quot;, username=&lt;optimized out&gt;) at postgres.c:4030
#9  0x000000000047adbc in BackendRun (port=0x2876120) at postmaster.c:4405
#10 BackendStartup (port=0x2876120) at postmaster.c:4077
#11 ServerLoop () at postmaster.c:1755
#12 0x00000000006afb7f in PostmasterMain (argc=argc@entry=3,
    argv=argv@entry=0x2831280) at postmaster.c:1363
#13 0x000000000047bbef in main (argc=3, argv=0x2831280) at main.c:228
</code></pre>

<p>The <code>bt</code> command of <code>gdb</code> displays the back-trace.
In this case, I intentionally crashed a PostgreSQL backend that was waiting for queries from the client by sending it a <code>SIGSEGV</code> signal, so the process crashed at <code>__epoll_wait_nocancel</code>, invoked by <code>WaitEventSetWait</code>.</p>
<h2 id="checking-the-backtrace-on-gpu">Checking the backtrace on GPU</h2>
<p>A crash dump of the GPU kernel is generated in the current working directory of the PostgreSQL server process, unless you explicitly specify the path using the <code>CUDA_COREDUMP_FILE</code> environment variable.
If systemd started PostgreSQL, check <code>/var/lib/pgdata</code>, where the database cluster is deployed. The dump file follows the naming convention below.</p>
<p><code>core_&lt;timestamp&gt;_&lt;hostname&gt;_&lt;PID&gt;.nvcudmp</code></p>
<p>Note that, in the default configuration, the dump file of a GPU kernel contains no debug information such as symbol information.
Without it, investigating the problem is nearly impossible, so enable inclusion of debug information in the GPU programs generated by PG-Strom, as follows.</p>
<p>Also note that we don't recommend turning on this configuration for daily usage, because it slows down query execution performance.
Turn it on only while you are investigating a problem.</p>
<pre><code>nvme=# set pg_strom.debug_jit_compile_options = on;
SET
</code></pre>

<p>You can check crash dump of the GPU kernel using <code>cuda-gdb</code> command.</p>
<pre><code># /usr/local/cuda/bin/cuda-gdb
NVIDIA (R) CUDA Debugger
9.1 release
Portions Copyright (C) 2007-2017 NVIDIA Corporation
        :
For help, type &quot;help&quot;.
Type &quot;apropos word&quot; to search for commands related to &quot;word&quot;.
(cuda-gdb)
</code></pre>

<p>Run the <code>cuda-gdb</code> command, then load the crash dump file above using the <code>target</code> command at the prompt.</p>
<pre><code>(cuda-gdb) target cudacore /var/lib/pgdata/core_1521131828_magro.heterodb.com_216238.nvcudmp
Opening GPU coredump: /var/lib/pgdata/core_1521131828_magro.heterodb.com_216238.nvcudmp
[New Thread 216240]

CUDA Exception: Warp Illegal Address
The exception was triggered at PC 0x7ff4dc82f930 (cuda_gpujoin.h:1159)
[Current focus set to CUDA kernel 0, grid 1, block (0,0,0), thread (0,0,0), device 0, sm 0, warp 0, lane 0]
#0  0x00007ff4dc82f938 in _INTERNAL_8_pg_strom_0124cb94::gpujoin_exec_hashjoin (kcxt=0x7ff4f7fffbf8, kgjoin=0x7fe9f4800078,
    kmrels=0x7fe9f8800000, kds_src=0x7fe9f0800030, depth=3, rd_stack=0x7fe9f4806118, wr_stack=0x7fe9f480c118, l_state=0x7ff4f7fffc48,
    matched=0x7ff4f7fffc7c &quot;&quot;) at /usr/pgsql-10/share/extension/cuda_gpujoin.h:1159
1159            while (khitem &amp;&amp; khitem-&gt;hash != hash_value)
</code></pre>

<p>You can check the back-trace at the point where the error happened in the GPU kernel using the <code>bt</code> command.</p>
<pre><code>(cuda-gdb) bt
#0  0x00007ff4dc82f938 in _INTERNAL_8_pg_strom_0124cb94::gpujoin_exec_hashjoin (kcxt=0x7ff4f7fffbf8, kgjoin=0x7fe9f4800078,
    kmrels=0x7fe9f8800000, kds_src=0x7fe9f0800030, depth=3, rd_stack=0x7fe9f4806118, wr_stack=0x7fe9f480c118, l_state=0x7ff4f7fffc48,
    matched=0x7ff4f7fffc7c &quot;&quot;) at /usr/pgsql-10/share/extension/cuda_gpujoin.h:1159
#1  0x00007ff4dc9428f0 in gpujoin_main&lt;&lt;&lt;(30,1,1),(256,1,1)&gt;&gt;&gt; (kgjoin=0x7fe9f4800078, kmrels=0x7fe9f8800000, kds_src=0x7fe9f0800030,
    kds_dst=0x7fe9e8800030, kparams_gpreagg=0x0) at /usr/pgsql-10/share/extension/cuda_gpujoin.h:1347
</code></pre>

<p>Please check <a href="https://docs.nvidia.com/cuda/cuda-gdb/">CUDA Toolkit Documentation - CUDA-GDB</a> for more detailed usage of the <code>cuda-gdb</code> command.</p>
              
            </div>
          </div>
          <footer>
  
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="../ssd2gpu/" class="btn btn-neutral float-right" title="SSD2GPU Direct SQL">Next <span class="icon icon-circle-arrow-right"></span></a>
      
      
        <a href="../partition/" class="btn btn-neutral" title="Partitioning"><span class="icon icon-circle-arrow-left"></span> Previous</a>
      
    </div>
  

  <hr/>

  <div role="contentinfo">
    <!-- Copyright etc -->
    
  </div>

  Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
      
        </div>
      </div>

    </section>

  </div>

  <div class="rst-versions" role="note" style="cursor: pointer">
    <span class="rst-current-version" data-toggle="rst-current-version">
      
      
        <span><a href="../partition/" style="color: #fcfcfc;">&laquo; Previous</a></span>
      
      
        <span style="margin-left: 15px"><a href="../ssd2gpu/" style="color: #fcfcfc">Next &raquo;</a></span>
      
    </span>
</div>
    <script>var base_url = '..';</script>
    <script src="../js/theme.js" defer></script>
      <script src="../search/main.js" defer></script>

</body>
</html>
